home *** CD-ROM | disk | FTP | other *** search
/ Chip: 2005 Utilities / CHIP Utilities 2005.7z / CHIP Utilities 2005.iso / docs / get-ubcd-docs-orig.pl < prev    next >
Perl Script  |  2004-10-31  |  12KB  |  378 lines

  1. #!/usr/bin/perl 
  2. # -w
  3.  
  4. # to do
  5. #
  6. # update freesco
  7. #
  8. # make html print more fields
  9. # tighten code
  10. # perlize for greater cross platform compatability
  11. # add routine to get size of images
  12. # improve descriptions
  13. # fix link building problem that effects 3 links
  14.  
  15. # start in base dir
  16. chdir "/docs_uncompressed";
  17.  
  18. # input file
  19. open(UBCDDOCS,"<ubcd-docs.csv");
  20.  
  21. # files to write
  22. open(UBCDINDEX,">ubcd-index.html");
  23. open(UBCDINDEXTABLE,">ubcd-index-table.html");
  24. open(UBCDINDEXTXT,">ubcd-index.csv");
  25. open(UBCDINDEXXML,">ubcd.xml");
  26. open(UBCDINDEXXST,">ubcd.xsl");
  27. open(UBCDINDEXDTD,">ubcd.dtd");
  28.  
  29. # sort input file by utility name
  30. $srt=`sort ubcd-docs.csv > ubcd-docs.csv.sorted`;
  31. $srt=`cp -f ubcd-docs.csv.sorted ubcd-docs.csv`;
  32. $srt=`rm -f ubcd-docs.csv.sorted`;
  33.  
  34. @data_vars=(utility,doc1,doc1title,doc2,doc2title,doc3,doc3title,doc4,doc4title,doc5,doc5title,doc6,doc6title,doc7,doc7title,webpage,imagename,description,dosapp,category,menu,maintainer,lastupdate,version,size);
  35. @data_vars_enc=(url_orig,utility_enc,doc1_enc,doc1title_enc,doc2_enc,doc2title_enc,doc3_enc,doc3title_enc,doc4_enc,doc4title_enc,doc5_enc,doc5title_enc,doc6_enc,doc6title_enc,doc7_enc,doc7title_enc,webpage_enc,imagename_enc,description_enc,dosapp_enc,category_enc,menu_enc,maintainer_enc,lastupdate_enc,version_enc,size_enc);
  36.  
  37. %titles=(utility => "Utility",doc1 => "Doc #1",doc1title => "Doc #1",doc2 => "Doc #1",doc2title => "Doc #1",doc3 => "Doc #1",doc3title => "Doc #1",doc4 => "Doc #1",doc4title => "Doc #1",doc5 => "Doc #1",doc5title => "Doc #1",doc6 => "Doc #1",doc6title => "Doc #1",doc7 => "Doc #1",doc7title => "Doc #1",webpage => "Doc #1",imagename => "Doc #1",dosapp => "Doc #1",category => "Doc #1",menu => "Doc #1",maintainer => "Doc #1",lastupdate => "Doc #1",version => "Doc #1");
  38. @doc_vars=(doc1,doc2,doc3,doc4,doc5,doc6,doc7,webpage);
  39. $webpagetitle="Web Page";
  40.  
  41. &printdtd;
  42. &printxsl;
  43. &printtitles;
  44.  
  45. print "Gathering Docs\n";
  46. foreach $utility_info (<UBCDDOCS>){
  47.     chomp($utility_info);
  48.     ($utility,$doc1,$doc1title,$doc2,$doc2title,$doc3,$doc3title,$doc4,$doc4title,$doc5,$doc5title,$doc6,$doc6title,$doc7,$doc7title,$webpage,$imagename,$description,$dosapp,$category,$menu,$maintainer,$lastupdate,$version,$size)=split /,/,$utility_info;
  49.     $utility =~ s/"//g;$description =~ s/"//g;$imagename =~ s/"//g;$dosapp =~ s/"//g;$category =~ s/"//g;$menu =~ s/"//g;$maintainer =~ s/"//g;$lastupdate =~ s/"//g;$size =~ s/"//g;$version =~ s/"//g;
  50.     chomp($imagename);
  51.     $imagename_fixed=lc $imagename;
  52.     $file_test="$imagename_fixed".".igz";
  53.     print "looking for [$file_test]\n";
  54.     if (-f "/mnt/disk/images/$file_test"){
  55.         $imagename_fixed="$imagename_fixed".".igz";
  56.     }else{
  57.         $imagename_fixed="$imagename_fixed".".img";    
  58.     }
  59.     
  60.     chomp($size=`du -k /mnt/disk/images/$imagename_fixed|cut -f1`);
  61.     if ($imagename !~ m/[a-z|A-Z]/ or $imagename =~ m/Utility/){next;};
  62.  
  63.     print "\n\n\nGetting docs for [$imagename|$dosapp]\n";
  64.     mkdir $imagename;
  65.     chdir "$imagename";
  66.     print UBCDINDEX qq!$utility   $description   !;
  67.     print UBCDINDEXTXT qq!$utility, $description, !;
  68.     print UBCDINDEXTABLE qq!<tr><td>$utility</td><td>$description</td>!;
  69.  
  70.     &printutilinfo;
  71.  
  72.     foreach $doc_var (@doc_vars){    
  73.         if ($$doc_var =~ m/[a-z|A-Z]/){
  74.             # remove quotes
  75.             $$doc_var =~ s/"//g;
  76.             $doc_url = $$doc_var;
  77.             
  78.             # define other doc vars
  79.             $doc_title_var = "$doc_var" . "title";
  80.  
  81.             # remove quotes
  82.             $$doc_title_var =~ s/"//g;
  83.  
  84.             print "Fetching [$$doc_var]\n";
  85.             system(qq!wget --continue --tries=1 --html-extension --convert-links --page-requisites --user-agent="Mozilla/4.0 (compatable; MSIE 6.0; Windows NT 5.1)" "$$doc_var"!);
  86.             if ($$doc_var =~ m/.pdf$|.PDF$/){
  87.                 $url_orig=$$doc_var;
  88.                 # convert PDFs to html
  89.                 $$doc_var =~ s/http:\//$imagename/g;
  90.                 print "PDF CONVERSION";
  91.                 $cvtoutput=`pdftotext -layout -htmlmeta -eol unix -nopgbrk "/docs_uncompressed/$$doc_var" 2>&1`;
  92.                 print " [$cvtoutput:/docs_uncompressed/$$doc_var]\n";
  93.                 unlink "/docs_uncompressed/$$doc_var";
  94.  
  95.                 # change converted PDF extension to html
  96.                 $$doc_var =~ s/.pdf$|.PDF$/.html/;
  97.     
  98.                 # fix document name & location to be loadable via a browser.
  99.                 $tmp_doc_var = $$doc_var;
  100.                 $$doc_var =~ s/\?|=|&|\+//g;
  101.                 if ($$doc_var ne $tmp_doc_var){
  102.                     chdir "/docs_uncompressed";
  103.                     rename ("/docs_uncompressed/$tmp_doc_var","/docs_uncompressed/$$doc_var");
  104.                     print "File moved [/docs_uncompressed/$tmp_doc_var|/docs_uncompressed/$$doc_var][$!]\n";
  105.                     chdir "/docs_uncompressed";
  106.                 }
  107.                 
  108.                 # write indexes
  109.                 &encode_xml_data;
  110.                 print UBCDINDEX qq!<A HREF="$$doc_var">$$doc_title_var [html]</A>   !;
  111.                 print UBCDINDEXTXT qq!$$doc_title_var, $$doc_var, !;
  112.                 print UBCDINDEXTABLE qq!<td><a href="$$doc_var">$$doc_title_var</a></td>!;
  113.                 &printdoc;
  114.             }else{
  115.                 $url_orig=$$doc_var;
  116.                 # change http: to file:
  117.                 $$doc_var =~ s/http:\//$imagename/g;
  118.  
  119.                 # if the document is a html type document as defined by wget ensure it ends in .html
  120.                 if ($$doc_var !~ m/.aspx$|.htm$|.html$|.faq$|.FAQ$|.lsm$|.txt$|.doc$/ and $$doc_var !~ m/\/$/){$$doc_var = $$doc_var . ".html"};
  121.  
  122.                 # fix document name & location to be loadable via a browser.
  123.                 $tmp_doc_var = $$doc_var;
  124.                 $$doc_var =~ s/\?|=|&|\+//g;
  125.                 if ($$doc_var ne $tmp_doc_var){
  126.                     chdir "/docs_uncompressed";
  127.                     rename ("/docs_uncompressed/$tmp_doc_var","/docs_uncompressed/$$doc_var");
  128.                     print "File moved [/docs_uncompressed/$tmp_doc_var|/docs_uncompressed/$$doc_var][$!]\n";
  129.                     chdir "/docs_uncompressed";
  130.                 }
  131.  
  132.                 # write indexes
  133.                 &encode_xml_data;
  134.                 print UBCDINDEX qq!<A HREF="$$doc_var">$$doc_title_var</A>   !;
  135.                 print UBCDINDEXTXT qq!$$doc_title_var, $$doc_var, !;
  136.                 print UBCDINDEXTABLE qq!<td><a href="$$doc_var">$$doc_title_var</a></td>!;
  137.                 &printdoc;
  138.             }
  139.         }
  140.     }
  141.     print UBCDINDEX qq!<BR>\n!;
  142.     print UBCDINDEXTXT qq!\n!;
  143.     print UBCDINDEXTABLE qq!</tr>\n!;
  144.     print UBCDINDEXXML qq!</utility_info>\n\n!;
  145.     chdir "/docs_uncompressed";
  146. }
  147.  
  148. print UBCDINDEX qq!</body></html>\n!;
  149. print UBCDINDEXTXT qq!End\n!;
  150. print UBCDINDEXTABLE qq!</table></body></html>\n!;
  151. print UBCDINDEXXML qq!</catalog>\n!;
  152.  
  153. chomp($check=`du -bs /docs_uncompressed`);
  154. print "\n\nDocs RAW Size [$check]\n\n";
  155.  
  156. print "Running html tidy on html files\n";
  157. system('find . -name *.html -o -name *.htm -print -exec tidy -modify -upper -quiet -omit -errors {} \; > /dev/null 2>&1');
  158. chomp($check=`du -bs /docs_uncompressed`);
  159. print "Docs after HTML Tidy Size [$check]\n\n";
  160.  
  161. print "Compressing docs_uncompressed to /cmp\n";
  162. $rm_old=`rm -rf /cmp /docs`;
  163. $pack_result=`webpack -b /cmp/`;
  164. chomp($check=`du -bs /cmp`);
  165. print "/cmp compressed Size [$check]\n\n";
  166.  
  167. print "Moving /cmp to /docs\n";
  168. $move=`mv -f /cmp /docs`;
  169.  
  170. print "Archiving docs\n";
  171. $tgz=`tar -czf /docs.tar.gz /docs`;
  172. chomp($tgz_size=`du -bs /docs.tar.gz`);
  173. print "Archive size [$tgz_size]\n\n";
  174.  
  175. print "Done\n\n";
  176. close;
  177. exit;
  178.  
  179. sub encode_xml_data{
  180. $utility_enc=$utility;
  181. $doc1_enc=$doc1;
  182. $doc1title_enc=$doc1title;
  183. $doc2_enc=$doc2;
  184. $doc2title_enc=$doc2title;
  185. $doc3_enc=$doc3;
  186. $doc3title_enc=$doc3title;
  187. $doc4_enc=$doc4;
  188. $doc4title_enc=$doc4title;
  189. $doc5_enc=$doc5;
  190. $doc5title_enc=$doc5title;
  191. $doc6_enc=$doc6;
  192. $doc6title_enc=$doc6title;
  193. $doc7_enc=$doc7;
  194. $doc7title_enc=$doc7title;
  195. $webpage_enc=$webpage;
  196. $imagename_enc=$imagename;
  197. $description_enc=$description;
  198. $dosapp_enc=$dosapp;
  199. $category_enc=$category;
  200. $menu_enc=$menu;
  201. $maintainer_enc=$maintainer;
  202. $lastupdate_enc=$lastupdate;
  203. $version_enc=$version;
  204. $size_enc=$size;
  205.  
  206. foreach $data (@data_vars_enc){ 
  207.     $$data =~ s/</</g;
  208.     $$data =~ s/&/&/g;
  209.     $$data =~ s/>/>/g;
  210.     $$data =~ s/"/"/g;
  211.     $$data =~ s/'/'/g;
  212. }
  213. }
  214.  
  215. sub printtitles{
  216. # print document titles
  217. print UBCDINDEX <<EOF;
  218. <html><head><title>UBCD CD Based Docs - HTML</title></head><body>
  219. <BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a>   <a href="ubcd-index-table.html">HTML Table</a>   <a href="ubcd.xml">XML</a>   <a href="ubcd-index.csv">CSV</a><BR><BR>
  220. <html><head><title>UBCD CD Based Docs - HTML</title></head><body>
  221. UTILITY   DESCRIPTION   DOCUMENTS<BR>
  222. EOF
  223.  
  224. print UBCDINDEXTABLE <<EOF;
  225. <html><head><title>UBCD CD Based Docs - HTML Table</title></head><body>
  226. <BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a>    <a href="ubcd-index-table.html">HTML Table</a>    <a href="ubcd.xml">XML</a>    <a href="ubcd-index.csv">CSV</a><BR><BR>
  227. <table border="1">
  228. <TR><TD>UTILITY</TD><TD>DESCRIPTION</TD><TD>DOCUMENTS</TD></TR>
  229. EOF
  230.  
  231. print UBCDINDEXTXT qq!UBCD CD Based Docs - Text Listing - CSV\n!;
  232. print UBCDINDEXTXT qq!UTILITY,DESCRIPTION,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,WEBPAGE,\n!;
  233. print UBCDINDEXXML <<EOF;
  234. <?xml version="1.0"?>
  235. <?xml-stylesheet type="text/xsl" href="ubcd.xsl"?>
  236. <!DOCTYPE catalog SYSTEM "ubcd.dtd">
  237. <catalog>
  238. EOF
  239. }
  240.  
  241. sub printutilinfo{
  242. &encode_xml_data;
  243. print UBCDINDEXXML <<EOF;
  244. <utility_info>
  245.     <utility>$utility_enc</utility>
  246.     <description>$description_enc</description>
  247.     <imagename>$imagename_enc</imagename>
  248.     <dosapp>$dosapp_enc</dosapp>
  249.     <category>$category_enc</category>
  250.     <menu>$menu_enc</menu>
  251.     <maintainer>$maintainer_enc</maintainer>
  252.     <lastupdate>$lastupdate_enc</lastupdate>
  253.     <size>$size_enc</size>
  254.     <version>$version_enc</version>
  255. EOF
  256. }
  257.  
  258. sub printdoc{
  259.     $$doc_title_var =~ s/</</g;
  260.     $$doc_title_var =~ s/&/&/g;
  261.     $$doc_title_var =~ s/>/>/g;
  262.     $$doc_title_var =~ s/"/"/g;
  263.     $$doc_title_var =~ s/'/'/g;
  264.  
  265.     $$doc_var =~ s/</</g;
  266.     $$doc_var =~ s/&/&/g;
  267.     $$doc_var =~ s/>/>/g;
  268.     $$doc_var =~ s/"/"/g;
  269.     $$doc_var =~ s/'/'/g;
  270.  
  271.     $$doc_url =~ s/</</g;
  272.     $$doc_url =~ s/&/&/g;
  273.     $$doc_url =~ s/>/>/g;
  274.     $$doc_url =~ s/"/"/g;
  275.     $$doc_url =~ s/'/'/g;
  276.  
  277. print UBCDINDEXXML <<EOF;
  278.     <doc>
  279.         <title>$$doc_title_var</title>
  280.         <location>$$doc_var</location>
  281.         <url>$url_orig</url>
  282.     </doc>
  283. EOF
  284. }
  285.  
  286. sub printdtd{
  287. # ubcd.dtd
  288. print UBCDINDEXDTD <<EOF;
  289. <!ELEMENT catalog (utility_info*)>
  290. <!ELEMENT utility_info (utility, description, imagename, dosapp?, category, menu, maintainer, lastupdate, size?, version, doc*)>
  291. <!ELEMENT utility (#PCDATA)>
  292. <!ELEMENT description (#PCDATA)>
  293. <!ELEMENT imagename (#PCDATA)>
  294. <!ELEMENT dosapp (#PCDATA)>
  295. <!ELEMENT category (#PCDATA)>
  296. <!ELEMENT menu (#PCDATA)>
  297. <!ELEMENT maintainer (#PCDATA)>
  298. <!ELEMENT lastupdate (#PCDATA)>
  299. <!ELEMENT size (#PCDATA)>
  300. <!ELEMENT version (#PCDATA)>
  301. <!ELEMENT doc (title, location, url)>
  302. <!ELEMENT title (#PCDATA)>
  303. <!ELEMENT location (#PCDATA)>
  304. <!ELEMENT url (#PCDATA)>
  305. EOF
  306. # end ubcd.dtd
  307. }
  308.  
  309. sub printxsl{
  310. # ubcd.xsl
  311. print UBCDINDEXXST <<EOF;
  312. <?xml version="1.0"?>
  313. <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  314. <xsl:output method="html" encoding="UTF-8"/>
  315.  
  316. <xsl:template match="/">
  317. <html><head><title>UBCD CD Based Docs - XML</title></head>
  318. <body>
  319. <p><b>UBCD CD based documentation</b></p>
  320. <ol>
  321. <xsl:apply-templates mode="TOC"/>
  322. </ol>
  323. <xsl:apply-templates mode="body"/>
  324. </body>
  325. </html>
  326. </xsl:template>
  327.  
  328. <xsl:template match="utility_info" mode="TOC">
  329. <li><a href="{concat('#utility', position())}"><xsl:value-of
  330. select="utility/text()"/></a></li>
  331. </xsl:template>
  332.  
  333. <xsl:template match="utility_info" mode="body">
  334. <p><a name="{concat('utility', position())}"><xsl:value-of
  335. select="text()"/></a></p>
  336. <xsl:apply-templates select="utility"/>
  337. <xsl:text> </xsl:text>
  338. <xsl:apply-templates select="version"/>
  339. <br></br>
  340. <xsl:apply-templates select="description"/>
  341. <br></br>
  342. <xsl:apply-templates select="lastupdate"/>
  343. <xsl:apply-templates select="doc"/>
  344. </xsl:template>
  345.  
  346. <xsl:template match="utility"><b><xsl:value-of
  347. select="text()"/></b></xsl:template>
  348.  
  349. <xsl:template match="description"><xsl:value-of select="text()"/></xsl:template>
  350.  
  351. <!-- this is a way to handle docs. please uncomment this and comment
  352. the doc template under it
  353. <xsl:template match="doc">
  354. <br></br><a href="{url}"><xsl:value-of select="title"/></a>
  355. </xsl:template>
  356. -->
  357.  
  358. <xsl:template match="doc">
  359. <br></br>
  360. <xsl:text>[documentation] </xsl:text>
  361. <xsl:value-of select="title"/>
  362. <xsl:text>: </xsl:text>
  363. <a href="{url}">web</a>
  364. <xsl:text> </xsl:text>
  365. <a href="{location}">local</a>
  366. </xsl:template>
  367.  
  368.  
  369. <xsl:template match="version">Version: <xsl:value-of
  370. select="text()"/></xsl:template>
  371. <xsl:template match="lastupdate">Last Updated: <xsl:value-of
  372. select="text()"/></xsl:template>
  373.  
  374. </xsl:stylesheet>
  375. EOF
  376. # end ubcd.xsl
  377. }
  378.